library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ readr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(fastDummies)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
data<-read.table("base_trabajo_segmentacion.csv" ,sep = ";",header = TRUE)
data<-na.omit(data)

#Modificando las variables
datos_reducidos=mutate(data, en_vm_otros_ = en_vm_canal6+en_vm_canal7+en_vm_canal8+en_vm_canal9+en_vm_canal10+en_vm_otros,en_tx_otros_=en_tx_canal6+en_tx_canal7+en_tx_canal8+en_tx_canal9+en_tx_canal10+en_tx_otros)




borrar <- c("nit","en_vm_canal6","en_vm_canal7","en_vm_canal8","en_vm_canal9","en_vm_canal10","en_vm_otros","en_tx_canal6","en_tx_canal7","en_tx_canal8","en_tx_canal9","en_tx_canal10","en_tx_otros")
datos2 <- datos_reducidos[ , !(names(datos_reducidos) %in% borrar)]#En este paso se eliminan las variables sumadas ateriormente

datos_categoricos_seleccionados=datos2[19:34]#Se seleccionan y almacenan los datos categóricos

datos_categoricos=c("impo_cv","expo_vt","cxp","cxc","totalinventory","pagos_pj","pagos_pn","tiene_ventas_fisicas","tiene_ventas_electronicas","recaudos_pj","recaudos_pn","rotacion_inventarios","rotacion_cxc","rotacion_cxp","ciclo_negocio","ciclo_financiero")

datos_numericos <- datos2[ , !(names(datos2) %in% datos_categoricos)]#Para hacer los primeros análisi se dejan únicamente las variables cuantitativas del modelo
Sigma_t<-cov(scale(datos_numericos,center=T,scale=T))
descomp_espectr_t<-eigen(Sigma_t)
lambdas_t<-descomp_espectr_t$values
D_t<-descomp_espectr_t$vectors
#En caso de ser necesario, se hará un análisis del ACP
acprincipales=prcomp(datos_numericos,scale=T)
acprincipales
## Standard deviations (1, .., p=20):
##  [1] 2.50011551 1.58117656 1.44141911 1.23849358 1.08063358 1.04720979
##  [7] 1.01300694 0.96792533 0.86610611 0.79281554 0.73493703 0.62590255
## [13] 0.62156148 0.56804575 0.43800337 0.32844986 0.22978995 0.18026508
## [19] 0.07335472 0.01626347
## 
## Rotation (n x k) = (20 x 20):
##                       PC1          PC2         PC3          PC4         PC5
## en_vm_canal1  0.196111232  0.298677762 -0.01758397 -0.070098205  0.38572320
## en_vm_canal2  0.269752007  0.142552018 -0.05219006 -0.006563866  0.22423646
## en_vm_canal3  0.201450180  0.387128867 -0.14771157  0.046934354 -0.16864279
## en_vm_canal4  0.297846107 -0.366537491 -0.04855269  0.034662152 -0.03653591
## en_vm_canal5  0.355720357 -0.135357501 -0.04741409  0.023337027 -0.14771284
## en_tx_canal1  0.161277692  0.336792134 -0.10955874  0.035322237  0.12588672
## en_tx_canal2  0.199525929  0.080423283  0.12993596 -0.116914463  0.51261720
## en_tx_canal3  0.153102310  0.360694034 -0.16532312  0.094299212 -0.44513141
## en_tx_canal4  0.293207788 -0.385478742 -0.04872198  0.039279168 -0.07593586
## en_tx_canal5  0.130764803  0.084102588  0.55763055 -0.177644631 -0.24835955
## sal_vm_canal5 0.355549688 -0.133621125 -0.04662081  0.022506824 -0.14639603
## sal_vm_canal2 0.366717606  0.048729711 -0.08218690  0.015839151  0.14993884
## sal_vm_canal8 0.002343343  0.006372857  0.01941851 -0.034607005  0.15363050
## sal_vm_otros  0.007285194  0.023670168  0.24532151  0.657392366  0.09991336
## sal_tx_canal5 0.127322078  0.091094294  0.59295464 -0.226633702 -0.20373390
## sal_tx_canal2 0.218138109  0.068249988  0.14034123 -0.048434239  0.03712224
## sal_tx_canal8 0.021668280  0.053766505  0.23075955 -0.201736318  0.15099951
## sal_tx_otros  0.011075602  0.024640324  0.29602421  0.629795232  0.07221945
## en_vm_otros_  0.322299875 -0.224979617 -0.05906251  0.034785163 -0.01017554
## en_tx_otros_  0.106727001  0.307190677 -0.12590279  0.088355427 -0.23795516
##                       PC6          PC7          PC8          PC9         PC10
## en_vm_canal1   0.38784352 -0.158015511  0.031190032 -0.244176757 -0.105757642
## en_vm_canal2  -0.14981641 -0.034504242 -0.002941420  0.206077140  0.225225036
## en_vm_canal3   0.41902490 -0.054678055  0.035633956  0.031051248 -0.006366434
## en_vm_canal4  -0.04879406  0.065205860 -0.061863365 -0.020648199 -0.161372706
## en_vm_canal5   0.18396294 -0.007619506  0.010779230  0.022846372  0.044592155
## en_tx_canal1  -0.36749841  0.058982604 -0.057909195 -0.496657067  0.316200015
## en_tx_canal2  -0.12587425 -0.108488905  0.041523555  0.321779379 -0.572382288
## en_tx_canal3   0.10933945  0.114410260 -0.034024746  0.362023347 -0.112299104
## en_tx_canal4  -0.04053301  0.077059198 -0.068586851 -0.058113415 -0.116324713
## en_tx_canal5  -0.04326276 -0.144373112  0.230030724 -0.170923173 -0.085522905
## sal_vm_canal5  0.18669821 -0.006814179  0.008649184  0.021112499  0.046441524
## sal_vm_canal2 -0.00644150 -0.045821833  0.012428190 -0.142945481  0.128301143
## sal_vm_canal8  0.11712594  0.781877402  0.590616547 -0.017626434  0.010102318
## sal_vm_otros   0.04095832  0.041137852 -0.059333702  0.014159818 -0.005855127
## sal_tx_canal5  0.01367655  0.018591611 -0.016655149 -0.129402957 -0.009948242
## sal_tx_canal2 -0.33799442 -0.070377237  0.164463846  0.529822797  0.444451488
## sal_tx_canal8  0.16692480  0.481973381 -0.727225701  0.121506884  0.138136008
## sal_tx_otros   0.04169115  0.017618524 -0.021902401  0.008097712  0.014567410
## en_vm_otros_  -0.09152651  0.062152440 -0.041063840 -0.189603343  0.018701110
## en_tx_otros_  -0.49624393  0.233409173 -0.137932019 -0.111838455 -0.461437218
##                       PC11         PC12         PC13         PC14         PC15
## en_vm_canal1   0.244983519 -0.076868729  0.227392762 -0.253507452  0.259041920
## en_vm_canal2  -0.733779203 -0.021298080  0.052792305 -0.354801761  0.162317314
## en_vm_canal3   0.181178574  0.066069076 -0.210840554 -0.046606806  0.159757171
## en_vm_canal4   0.040729356  0.025725848 -0.054347350  0.069695839  0.516078313
## en_vm_canal5  -0.049859626 -0.133249517  0.391517763  0.174090569 -0.285367875
## en_tx_canal1  -0.015861029  0.013137148 -0.005153170  0.561993318  0.127307880
## en_tx_canal2  -0.013133910  0.053841274 -0.112440903  0.371155474 -0.160374817
## en_tx_canal3  -0.155617204  0.100723364 -0.258779315  0.240420692  0.073983600
## en_tx_canal4   0.013297342  0.012600530 -0.024225234  0.032751845  0.414029786
## en_tx_canal5  -0.097872306 -0.019847792 -0.056238204 -0.020676874  0.039606046
## sal_vm_canal5 -0.047639635 -0.134561698  0.395587850  0.175561794 -0.285611190
## sal_vm_canal2 -0.003463341  0.087481855 -0.253303179 -0.178270558 -0.311099119
## sal_vm_canal8 -0.017523663 -0.003077326  0.005043460  0.001355643  0.002521448
## sal_vm_otros   0.001434063 -0.663600744 -0.222051286  0.014546280 -0.003942049
## sal_tx_canal5 -0.069044365 -0.014526828 -0.050418801 -0.019840603  0.031182920
## sal_tx_canal2  0.520178011 -0.049538124  0.090962104 -0.070959912  0.109624164
## sal_tx_canal8  0.061178064  0.014495336  0.008000959  0.007213890 -0.029435993
## sal_tx_otros   0.003743195  0.672295752  0.234301002 -0.007734474 -0.001142211
## en_vm_otros_   0.166777069  0.165464179 -0.485462477 -0.236533354 -0.340826480
## en_tx_otros_   0.143769655 -0.093356080  0.299984533 -0.367848658 -0.066839370
##                        PC16          PC17          PC18          PC19
## en_vm_canal1  -0.4554747473  0.1060774569 -0.0571497707  1.312923e-02
## en_vm_canal2   0.0973312044  0.1517185393 -0.1030954239  8.996487e-04
## en_vm_canal3   0.6778960289  0.0843106002 -0.0113631201 -1.095461e-02
## en_vm_canal4   0.0368471524 -0.4495182149 -0.5048161416 -1.958514e-02
## en_vm_canal5   0.0267974728  0.0396445355 -0.0643985751 -3.467268e-02
## en_tx_canal1   0.0235883333  0.0883747263 -0.0451777472 -4.487990e-03
## en_tx_canal2   0.1241789037  0.0680772535  0.0132591961  2.166125e-02
## en_tx_canal3  -0.5263892043 -0.0145532391  0.0160122640  7.560946e-03
## en_tx_canal4   0.0003809804  0.3272617773  0.6675951386  1.892525e-02
## en_tx_canal5  -0.0160306206  0.0047850827  0.0151394914 -6.655112e-01
## sal_vm_canal5  0.0293774798  0.0410383414 -0.0673642077  2.898858e-02
## sal_vm_canal2 -0.0483780641 -0.6540193087  0.4049367122 -3.842812e-03
## sal_vm_canal8  0.0080772555 -0.0072415633  0.0037986875  1.770073e-03
## sal_vm_otros  -0.0053432403  0.0001224729 -0.0009743556 -2.370944e-06
## sal_tx_canal5  0.0142207103 -0.0275674395 -0.0109960727  7.087737e-01
## sal_tx_canal2 -0.0261467939  0.0275080847  0.0049916138 -3.990052e-03
## sal_tx_canal8  0.0248266151 -0.0143279827  0.0119539773 -2.259741e-01
## sal_tx_otros   0.0038072093  0.0026770629 -0.0008674446 -1.120593e-04
## en_vm_otros_  -0.1216932174  0.4506043292 -0.3311267779  1.783503e-03
## en_tx_otros_   0.0922375963 -0.0308490189  0.0119341021  3.055133e-03
##                        PC20
## en_vm_canal1  -2.296608e-03
## en_vm_canal2   7.314391e-04
## en_vm_canal3  -1.254960e-03
## en_vm_canal4   1.049905e-03
## en_vm_canal5  -7.078796e-01
## en_tx_canal1  -1.396618e-03
## en_tx_canal2  -4.436135e-04
## en_tx_canal3   1.915983e-03
## en_tx_canal4   9.653443e-04
## en_tx_canal5   3.051265e-02
## sal_vm_canal5  7.048763e-01
## sal_vm_canal2  2.319367e-03
## sal_vm_canal8 -6.905418e-05
## sal_vm_otros  -2.696364e-05
## sal_tx_canal5 -3.227495e-02
## sal_tx_canal2 -1.027991e-05
## sal_tx_canal8  7.879954e-03
## sal_tx_otros  -8.026732e-05
## en_vm_otros_   6.812733e-04
## en_tx_otros_   1.037106e-05
#Qué % de variablidad es explicada para cada componente:
prop_varianza <- acprincipales$sdev^2 / sum(acprincipales$sdev^2)
prop_varianza*100
##  [1] 31.252887750 12.500596548 10.388445237  7.669331760  5.838844620
##  [6]  5.483241744  5.130915334  4.684397232  3.750698974  3.142782372
## [11]  2.700662212  1.958770004  1.931693356  1.613379883  0.959234753
## [16]  0.539396539  0.264017105  0.162477498  0.026904577  0.001322502
ggplot(data = data.frame(prop_varianza, pc = 1:20),
       aes(x = pc, y = prop_varianza)) +
  geom_col(width = 0.3) +
  scale_y_continuous(limits = c(0,1)) +
  theme_bw() +
  labs(x = "Componente principal",
       y = "Prop. de varianza explicada")

prop_varianza_acum <- cumsum(prop_varianza)
prop_varianza_acum
##  [1] 0.3125289 0.4375348 0.5414193 0.6181126 0.6765011 0.7313335 0.7826426
##  [8] 0.8294866 0.8669936 0.8984214 0.9254280 0.9450157 0.9643327 0.9804665
## [15] 0.9900588 0.9954528 0.9980930 0.9997177 0.9999868 1.0000000
ggplot(data = data.frame(prop_varianza_acum, pc = 1:20),
       aes(x = pc, y = prop_varianza_acum, group = 1)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  labs(x = "Componente principal",
       y = "Prop. varianza explicada acumulada")

#Con las 9 primeras compoentes principales se explica el 87% de la variabilidad total

#Datos proyectados con las 9 primeras componentes principales
datos_proyectados=acprincipales$x#Acá está con todas sus componetes principales
datos_proyectados_reducidos=datos_proyectados[,1:9]
#Los vectores propios son los siguientes:
vec_propios=acprincipales$rotation
val_propios=lambdas_t
#Tomaremos únicamente los necesarios
D_red_t<-vec_propios[,1:9]
lambdas_red_t<-val_propios[1:9]
#Se recontruye la matriz de covarianzas escalada
Sigma_rec_t<-D_red_t%*%diag(lambdas_red_t)%*%t(D_red_t)

Reconstrucción de los datos originales a partir de la proyección en las nueve primeras componentes principales

datos_reconstruidos_esc_cent_t<-datos_proyectados_reducidos%*%t(D_red_t)

#Ahora, se hace el clustering con las 9 compnentes principales

#Recordemos que éstas están almacenadas en la varianle datos_proyectados_reducidos
set.seed(1234)
wcss <- vector()
for(i in 1:20){
  wcss[i] <- sum(kmeans(datos_proyectados_reducidos, i)$withinss)
}
#Se verifica el número de centroides óptimo
ggplot() + geom_point(aes(x = 1:20, y = wcss), color = 'blue') + 
  geom_line(aes(x = 1:20, y = wcss), color = 'blue') + 
  ggtitle("Método del Codo") + 
  xlab('Cantidad de Centroides k') + 
  ylab('WCSS')

#Enotro gráfico más bonito
set.seed(3) # Se fija la semilla para obtener resultados reproducibles
# Cálculo de los grupos
centers <- 2:10 # este es el valor de K
resultados <- vector(mode="list",length = 10) # en esta lista se almacenan los resultados de cada agrupamiento
for (i in 1:length(centers)){
  resultados[[i]] <- kmeans(x=datos_proyectados_reducidos,centers=centers[i],nstart = 3)
}




# Extracción de la métrica de desempeño ("withinss") para cada K:
metrica_cl <- do.call("rbind",lapply(resultados,"[[",5))
num_centros <- 2:10
res_num_cen <- data.frame(num_centros,metrica_cl)
grph_metrica_cl <- ggplot(res_num_cen,aes(x=num_centros,xend=num_centros,y=0,yend=metrica_cl))
grph_metrica_cl + geom_point(aes(x=num_centros,y=metrica_cl)) +  geom_segment() + theme_bw() + labs(title = "Desempeño del agrupamiento \n en función de K",
                                      x = "K (cantidad de centros)",
                                      y = "Métrica de desempeño")

#Por ahora se seleccionan 5 grupos

set.seed(123)
for (i in 2:8){wines_K2 <- kmeans(datos_proyectados_reducidos, centers =i , nstart = 25)
print(wines_K2$size)

  
}
## [1]    6 2227
## [1]   24 2208    1
## [1] 2180    6   46    1
## [1]    6    1 2171   53    2
## [1]   52    2    1    1    6 2171
## [1]    6  203 2003    2    1   17    1
## [1]    1 2004    1   11    6    6  202    2
set.seed(123)
kmenas=kmeans(datos_proyectados_reducidos, centers =5 , nstart = 25)
fviz_cluster(kmenas, data = datos_numericos)

acprincipales$scale
##  en_vm_canal1  en_vm_canal2  en_vm_canal3  en_vm_canal4  en_vm_canal5 
##  8.628981e+09  2.420627e+10  1.897338e+09  9.285151e+09  2.270161e+10 
##  en_tx_canal1  en_tx_canal2  en_tx_canal3  en_tx_canal4  en_tx_canal5 
##  7.015267e+03  2.792873e+02  1.222524e+04  7.483593e+04  1.865614e+01 
## sal_vm_canal5 sal_vm_canal2 sal_vm_canal8  sal_vm_otros sal_tx_canal5 
##  2.276978e+10  3.196322e+10  5.087747e+08  1.648173e+07  1.979337e+01 
## sal_tx_canal2 sal_tx_canal8  sal_tx_otros  en_vm_otros_  en_tx_otros_ 
##  2.764380e+03  9.716711e+01  1.360357e+01  2.055489e+10  1.187329e+04
biplot(x=acprincipales,scale = 0)

dfPCA <- as.data.frame(acprincipales$x)
dfPCA <- cbind(dfPCA, kmenas$cluster )
dfPCA$`kmenas$cluster` <- as.factor(dfPCA$`kmenas$cluster`)
dfPCA$`kmenas$cluster` <- fct_recode(dfPCA$`kmenas$cluster`, "Grupo 1" = "1", 
                                     "Grupo 2" = "2",
                                     "Grupo 3" = "3",
                                     "Grupo 4" = "4",
                                     "Grupo 5" = "5" )
plotly::plot_ly(dfPCA, x=~PC1, y=~PC2, z=~PC3, color=dfPCA$`kmenas$cluster` )
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode